Introduction to ggplot

A simple example

  • The functions in the ggplot2 package build up a graph in layers.
  • We’ll build a a complex graph by starting with a simple graph and adding additional elements, one at a time.

We are going to download data from the III Forest Inventory (Murcia)

##load data
FI_Murcia <- read.csv("~/Dropbox/Rcourse/day1/FI_Murcia.csv")
attach(FI_Murcia)
names(FI_Murcia)
## [1] "Species"   "Shape"     "Height"    "Diameter1" "Diameter2"
##specify dataset and mapping
library(ggplot2) 
ggplot(data = FI_Murcia,
  mapping = aes(x = Diameter1, y = Height))
Map variables

Map variables

We need to specify what we wanted placed on the graph

##add geom_point
library(ggplot2) 
ggplot(data = FI_Murcia,
  mapping = aes(x = Diameter1, y = Height))+
  geom_point()

We can change the color with color

##make points blue, larger, and semi-transparent
ggplot(data = FI_Murcia,
mapping = aes(x = Diameter1, y = Height)) +
  geom_point(color = "cornflowerblue", alpha = .7,
  size = 3)

And also we can add a trend line (in this case it’s not really informative :( )

##add a line of best fit
ggplot(data = FI_Murcia,
mapping = aes(x = Diameter1, y = Height)) +
  geom_point(color = "cornflowerblue", alpha = .7, size = 3) + 
  geom_smooth(method = "lm")

Also, we can differentiate different levels to plot them separately

##indicate species using color
FI_Murcia_sub = subset(FI_Murcia, Species == c("Castanea sativa", "Fagus sylvatica", "Pinus nigra", "Quercus robur"))
ggplot(data = FI_Murcia_sub, 
mapping = aes(x = Diameter1, y = Height, color = Species)) + 
  geom_point(alpha = .7, size = 3) + 
  geom_smooth(method = "lm", se = FALSE, size = 1.5)

And also, we can plot different charts for each species

# reproduce plot for each species
ggplot(data = FI_Murcia_sub, 
mapping = aes(x = Diameter1, y = Height)) + 
  geom_point(alpha = .7) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_continuous(breaks = seq(0, 2, 0.5)) + 
  scale_y_continuous(breaks = seq(0, 60, 10)) + 
  scale_color_manual(values = c("indianred3","cornflowerblue")) +
  facet_wrap(~Species)

Univariate Graphs - Categorical

Plot the distribution of species

library(ggplot2)
ggplot(FI_Murcia_sub, aes(x = Species)) + 
  geom_bar()

Plot the distribution of species with modified colors and labels

ggplot(FI_Murcia_sub, aes(x = Species)) + 
  geom_bar(fill = "cornflowerblue", color="black") + 
  labs(x = "Species", y = "Frequency", 
       title = "Individuals by species")

Plot the distribution as percentages

ggplot(FI_Murcia_sub, aes(x = Species, y = ..count.. / sum(..count..))) + 
  geom_bar() + labs(x = "Species", y = "Percent", 
                    title = "Individuals by species") + 
  scale_y_continuous(labels = scales::percent)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Calculate number of participants in each species category

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
plotdata <- FI_Murcia_sub %>%
  count(Species)
plotdata
##           Species    n
## 1 Castanea sativa   27
## 2 Fagus sylvatica 1441
## 3     Pinus nigra  130
## 4   Quercus robur  197

Plot the bars in ascending order

# plot the bars in ascending order
ggplot(plotdata, aes(x = reorder(Species, n), y = n)) + 
  geom_bar(stat = "identity") +
  labs(x = "Species", y = "Frequency", 
       title = "Individuals by species")

Plot the bars in descending order

ggplot(plotdata, aes(x = reorder(Species, -n), y = n)) + 
  geom_bar(stat = "identity") +
  labs(x = "Species", y = "Frequency", 
       title = "Individuals by species")

plot the bars with numeric labels geom_text adds the labels, vjust controls vertical justification

ggplot(plotdata, aes(x = reorder(Species, n), y = n)) + 
  geom_bar(stat = "identity") + 
  geom_text(aes(label = n), vjust=-0.5) +
  labs(x = "Species", y = "Frequency", 
       title = "Individuals by species")

plotdata <- FI_Murcia_sub %>% count(Species) %>% mutate(pct = n / sum(n),
pctlabel = paste0(round(pct*100), "%"))

Plot the bars as percentages, in descending order with bar labels

ggplot(plotdata, aes(x = reorder(Species, -pct), y = pct)) +
  geom_bar(stat = "identity", fill = "indianred3", color = "black") + 
  geom_text(aes(label = pctlabel), vjust = -0.25) + 
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Species", y = "Percent", title = "Individuals by species")

Overlapping labels

Horizontal bar chart

ggplot(FI_Murcia_sub, aes(x = Species)) + 
  geom_bar() +
  labs(x = "", y = "Frequency", title = "Individuals by shape") + coord_flip()

Univariate Graphs - Quantitative

Histogram

Plot the age distribution using a histogram

ggplot(FI_Murcia_sub, aes(x = Height)) + geom_histogram() +
  labs(title = "Individuals by Height", x = "Height")

Plot the histogram with blue bars and white borders

ggplot(FI_Murcia_sub, aes(x = Height)) + 
  geom_histogram(fill = "cornflowerblue", color = "white") + 
  labs(title = "Individuals by Height", x = "Height")

Plot the histogram with 20 bins

ggplot(FI_Murcia_sub, aes(x = Height)) + 
  geom_histogram(fill = "cornflowerblue", color = "white", bins = 20) + 
  labs(title="Individuals by Height", subtitle = "number of bins = 20", x = "Height")

Plot the histogram with a binwidth of 5

ggplot(FI_Murcia_sub, aes(x = Height)) + 
  geom_histogram(fill = "cornflowerblue", color = "white", binwidth = 5) + 
  labs(title="Individuals by Height", subtitle = "binwidth = 5 meters", x = "Height")

Plot the histogram with percentages on the y-axis

library(scales) 
ggplot(FI_Murcia_sub, aes(x = Height, y= ..count.. / sum(..count..))) +
  geom_histogram(fill = "cornflowerblue", color = "white", binwidth = 5) + 
  labs(title="Individuals by Height", y = "Percent", x = "Height") + 
  scale_y_continuous(labels = percent)

There are more things to add some “fantasy” to your plots as:

  • Kernel density plot
  • Dotplot
  • etc.

Bivariate Graphs

Categorical vs Categorical

Stacked bar chart

FI_Murcia_sub$Shape<- as.factor(FI_Murcia_sub$Shape)
ggplot(FI_Murcia_sub,
aes(x = Species,
fill = Shape)) + geom_bar(position = "stack")

From the chart, we can see for example, that the most common species is F. sylvatica. An the most common shape for all the species is 2

Stacked is the default, so the last line could have also been written as geom_bar().

Grouped bar chart

ggplot(FI_Murcia_sub,
      aes(x = Species,
          fill = Shape)) + geom_bar(position = "dodge")

By default, zero count bars are dropped and the remaining bars are made wider. This may not be the behavior you want. You can modify this using the position = position_dodge(preserve = "single") option.

ggplot(FI_Murcia_sub,
      aes(x = Species,
          fill = Shape)) + 
        geom_bar(position = position_dodge(preserve="single"))

Segmented bar chart

FI_Murcia_sub$Shape<- as.factor(FI_Murcia_sub$Shape)
ggplot(FI_Murcia_sub,
      aes(x = Species,
          fill = Shape)) + 
        geom_bar(position = "fill") + labs(y="Proportion")

This type of plot is particularly useful if the goal is to compare the percentage of a category in one variable across each level of another variable.

Quantitative vs Quantitative

The simplest display of two quantitative variables is a scatterplot, with each variable represented on an axis.

ggplot(FI_Murcia_sub,
aes(x = Diameter1, y = Height)) + 
  geom_point()

Scatterplot with linear fit line

ggplot(FI_Murcia_sub,
aes(x = Diameter1, y = Height)) + 
  geom_point(color="steelblue") +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Clearly, Height increases with Diameter. However a straight line does not capture this non-linear effect. A polynomial regression line will fit better here.

Scatterplot with quadratic line of best fit

ggplot(FI_Murcia_sub,
aes(x = Diameter1, y = Height)) + 
  geom_point(color="steelblue") +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2),
  color = "indianred3")

Finally, a smoothed nonparametric fit line can often provide a good picture of the relationship. The default in ggplot2 is a loess line which stands for for locally weighted scatterplot smoothing.

ggplot(FI_Murcia_sub,
aes(x = Diameter1, y = Height)) + 
  geom_point(color = "steelblue") +
  geom_smooth(color = "tomato")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Line plot

When one of the two variables represents time, a line plot can be an effective method of displaying relationship.

library(curl)
## Using libcurl 8.4.0 with LibreSSL/3.3.6
rats <- read.csv("~/Dropbox/Rcourse/day1/Rats.csv")
ggplot(rats, 
       aes(x = time, y = captures)) + 
  geom_line()

Line plot with points and improved labeling

ggplot(rats,
  aes(x = time, y = captures)) +
geom_line(size = 1.5, color = "lightgrey") + geom_point(size = 3, color = "steelblue") +
labs(y = "Individuals captured", x = "Ocasion",
title = "Removal method data", subtitle = "An Attempt to Determine the Absolute Number of Rats on a Given Area",
caption = "Leslie and Davies (1939)")

Categorical vs Quantitative

When plotting the relationship between a categorical variable and a quantitative variable, a large number of graph types are available.

These include bar charts using summary statistics, grouped kernel density plots, side-by-side box plots, side-by-side violin plots, and Cleveland plots.

Bar chart (on summary statistics)

# calculate mean height for each species
plotdata <- FI_Murcia_sub %>%
    group_by(Species) %>% 
    summarize(mean_Height = mean(Height))
# plot mean height
ggplot(plotdata, 
       aes(x = Species, y = mean_Height)) + 
  geom_bar(stat = "identity")

Plot mean heights in a more attractive fashion

library(scales) 
ggplot(plotdata,
      aes(x = factor(Species,
          labels = c("Castanea\nsativa", "Fagus\nsylvatica", "Pinus\nnigra", "Quercus\nrobur")), y = mean_Height)) +
    geom_bar(stat = "identity", fill = "cornflowerblue") +
    geom_text(aes(label = round(mean_Height, 2)), 
      vjust = -0.2) +
  labs(title = "Mean Height by tree species",
    subtitle = "National Forest Inventory (Murcia)", x = "",
    y = "m")

Grouped kernel density plots

One can compare groups on a numeric variable by superimposing kernel density plots in a single graph.

By rank using kernel density plots

ggplot(FI_Murcia_sub,
    aes(x = Height, fill = Species)) +
  geom_density(alpha = 0.4) +
  labs(title = "Height distribution by species")

alpha values range from 0 (transparent) to 1 (opaque)

Box plots

Plot the distribution of height by species using boxplot

ggplot(FI_Murcia_sub,
      aes(x = Species, y = Height)) + 
  geom_boxplot() +
labs(title = "Height distribution by species")

Violin plots

Violin plots are similar to kernel density plots, but are mirrored and rotated 90\(^\circ\) .

ggplot(FI_Murcia_sub,
      aes(x = Species, y = Height)) +
      geom_violin() +
      labs(title = "Height distribution by species")

A useful variation is to superimpose boxplot on violin plots.

ggplot(FI_Murcia_sub,
      aes(x = Species, y = Height)) +
      geom_violin(fill = "cornflowerblue") + 
      geom_boxplot(width = .2, fill = "orange", outlier.color = "orange", outlier.size = 2) +
      labs(title = "Height distribution by species")

Cleveland Dot Charts

Cleveland plots are useful when you want to compare a numeric statistic for a large number of groups.

plotdata<-FI_Murcia %>% 
    group_by(Species) %>%
    mutate(Mean = mean(Height, na.rm=TRUE))
ggplot(plotdata,
      aes(x = Mean, y = Species)) +
      geom_point()

ggplot(plotdata,
      aes(x = Mean, y = reorder(Species,Mean))) +
      geom_point()

Multivariate Graphs

Grouping

In grouping, the values of the first two variables are mapped to the x and y axes. Then additional variables are mapped to other visual characteristics such as color, shape, size, line type, and transparency. Grouping allows you to plot the data for multiple groups in a single graph.

library(ggplot2)
ggplot(Fires, 
      aes(x = speed..km.day.1., y=Size..km2.)) + 
      geom_point() +
      labs(title = "Size (km2) by speed (km/day)")

Next, let’s include the Regions, using color.

ggplot(Fires, 
      aes(x = speed..km.day.1., y=Size..km2., 
      color = dominant.spread.direction)) + geom_point() +
      labs(title = "Size (km2) by speed (km/day) and region")

Finally, let’s add the shape of the trees, using the shape of the points to indicate shape. We’ll increase the point size and add transparency to make the individual points clearer.

ggplot(Fires, 
      aes(x = speed..km.day.1., y=Size..km2., 
      color = dominant.spread.direction, shape = Region)) + geom_point() +
      labs(title = "Size (km2) by speed (km/day), region and dominant spread direction")

ggplot(Fires, 
  aes(x = speed..km.day.1., y=Size..km2., color = Region)) + 
  geom_point(alpha = .4, size = 3) +
  geom_smooth(se=FALSE, method = "lm", formula = y~poly(x,2), size = 1.5) + 
  labs(x = "Speed (km/day)", y = "Size (km2)", title = "Size (km2) by speed (km/day) and region", 
    subtitle = "Summary of Large Forest Fires", y = "", color = "Regions") +
    scale_color_brewer(palette = "Set1") + theme_minimal()

Grouping allows you to plot multiple variables in a single graph, using visual characteristics such as color, shape, and size.

In faceting, a graph consists of several separate plots or small multiples, one for each level of a third variable, or combination of variables. It is easiest to understand this with an example.

ggplot(Fires, aes(x = Size..km2.)) + geom_histogram(fill = "cornflowerblue",
    color = "white") + facet_wrap(~Region, ncol = 1) +
    labs(title = "Size (km2) by species")

The facet_wrap function creates a separate graph for each species. The ncol option controls the number of columns.

ggplot(Fires, aes(x = Size..km2.)) + 
  geom_histogram(fill = "cornflowerblue", color = "white") + 
  facet_wrap(Region ~ dominant.spread.direction) +
  labs(title = "Size histograms by Region")

The format of the facet_grid function is facet_grid( row variable(s) ~ column variable(s))

# plot total area by year, for each country
ggplot(plotdata, aes(x=year, y = total)) + geom_line(color="grey") + 
geom_point(color="blue") + facet_wrap(~Region) + theme_minimal(base_size = 9) + 
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
labs(title = "Changes in burned area",x = "Year",y = "Total area")

Maps

library(rworldmap) 
# get map
MyMap <- ggplot() + borders("world", colour="black", fill="grey") + theme_bw()
MyMap

MyMap  +
  geom_point(data = Fires, aes(x=lon, y=lat),size = 2, color = "red")+
  theme(panel.grid.major = element_line(color = "gray60", linetype = "dashed", size = 0.25), 
        panel.background = element_rect(fill = "aliceblue"))
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

library(viridis)
MyMap  + 
  geom_point(data = Fires, aes(x=lon, y=lat, color = Size..km2.),size = 2) +
  scale_color_viridis() +
  theme(panel.grid.major = element_line(color = "gray60", linetype = "dashed", size = 0.25), 
        panel.background = element_rect(fill = "aliceblue"))

library(viridis)
MyMap  + coord_fixed(xlim=c(-10, 37.5), ylim=c(30, 48)) + 
  geom_point(data = Fires, aes(x=lon, y=lat, color = Size..km2.),size = 2) +
  scale_color_viridis() +
  theme(panel.grid.major = element_line(color = "gray60", linetype = "dashed", size = 0.25), 
        panel.background = element_rect(fill = "aliceblue"))

MyMap  + coord_fixed(xlim=c(-10, 37.5), ylim=c(30, 48)) + 
  geom_point(data = Fires, aes(x=lon, y=lat, color = Size..km2.),size = 2) +
  scale_color_viridis() +
  facet_wrap(~year) +
  theme(panel.grid.major = element_line(color = "gray60", linetype = "dashed", size = 0.25), 
        panel.background = element_rect(fill = "aliceblue"))

Time-dependent graphs

Time series

ggplot(pm10subset, aes(x = day, y = pm10)) +
      geom_line() +
      labs(title = "Levels of pm10 during 2014", x = "Day of the year", y = "pm10 concentration")

library(scales)
ggplot(pm10subset, aes(x = day, y = pm10)) +
    geom_line(color = "indianred3", size=1 ) +
    geom_smooth() +  labs(title = "Pm10 levels", subtitle = "2014", x = "DOY", y = "pm10 concentration") +
    theme_minimal()

# multivariate time series
attach(pm10)
ggplot(pm10,
    aes(x=day , y= pm10, color=site)) + geom_line(size=1) + 
    labs(title = "pm10 levels London",
    subtitle = "2014", caption = "source: OpenAir", y = "pm10 concentration") +
    theme_minimal() + scale_color_brewer(palette = "Dark2")

Area Charts

ggplot(pm10subset, aes(x = day, y = pm10)) + 
  geom_area(fill="lightblue", color="black") + 
  labs(title = "pm10 levels", x = "day", y = "pm10 concentration")

A stacked area chart can be used to show differences between groups over time.

ggplot(pm10, aes(x = day, y = pm10, fill= site)) + 
  geom_area() + 
  labs(title = "pm10 levels", x = "day", y = "pm10 concentration")

Statistical models

Correlation plots

The levels of the site variable can be reversed using the fct_rev function in the forcats package.

df <- dplyr::select_if(FI_Murcia, is.numeric)
r <- cor(df, use="complete.obs") 
round(r,2)
##           Shape Height Diameter1 Diameter2
## Shape      1.00  -0.52     -0.22     -0.22
## Height    -0.52   1.00      0.67      0.67
## Diameter1 -0.22   0.67      1.00      0.99
## Diameter2 -0.22   0.67      0.99      1.00
library(ggcorrplot) 
ggcorrplot(r)

By default, zero count bars are dropped and the remaining bars are made wider. This may not be the behavior you want. You can modify this using the position = position_dodge(preserve = "single") option.

ggcorrplot(r,
            hc.order = TRUE,
            type = "lower", lab = TRUE)

Linear Regression

Height_lm <- lm(Height ~ Diameter1 + Diameter2, data = FI_Murcia)
library(visreg)
#The visreg function takes (1) the model and (2) the variable of interest and plots 
#the conditional relationship, controlling for the other variables. 
#The option gg = TRUE is used to produce a ggplot2 graph.
visreg(Height_lm, "Diameter1", gg = TRUE) +
  labs(title = "Relationship between Height and Diameter1",
  caption = "source:   NFI Murcia",
y = "Height (m)",
x = "Diameter (m)")

Logistic regression

visreg(Chrod_glm, "altitude",
    gg = TRUE,
    scale="response") + labs(y = "Prob(Presence)", x = "Altitude",
    title = "Relationship of Age and Presence")